library(knitr)
library(tidyverse)
library(nhsrtheme)
library(icons)
library(xaringanExtra)
library(dataedu)
library(lubridate)

# set default options
opts_chunk$set(fig.width = 7.252,
               fig.height = 4,
               dpi = 300)

xaringanExtra::use_panelset()

all_files <- dataedu::all_files
child_counts <- dataedu::child_counts
library(metathis)
meta() %>%
  meta_name("github-repo" = "ivelasq/") %>% 
  meta_social(
    title = "Longitudinal Analysis With Federal Students With Disabilities Data",
    description = paste(
      "Slides for R4DS Data Science in Education Bookclub"
    ),
    url = "https://github.com/r4ds/bookclub-dsieur",
    og_type = "website",
    og_author = "Isabella Velásquez",
    twitter_card_type = "summary_large_image",
    twitter_creator = "@ivelasq",
    twitter_site = "@ivelasq"
  )

class: title-slide, left, bottom, inverse background-image: url(img/title.jpg) background-size: cover

.pull-left[

r rmarkdown::metadata$title


r rmarkdown::metadata$subtitle

r rmarkdown::metadata$author

r rmarkdown::metadata$date

]

.palegrey[.right[.footnote[Photo by Ave Calvar]]]


class: center, middle, inverse

Topics Introduced

--


.center[ r icons::fontawesome$solid$table Importing ]

--

.center[ r icons::fontawesome$solid$broom Tidying ]

--

.center[ r icons::fontawesome$solid$random Transforming ]

--

.center[ r icons::fontawesome$solid$eye Visualizing ]

--

.center[ r icons::fontawesome$solid$question Modeling ]

--

.center[ r icons::fontawesome$solid$comment Communicating ]


Functions & Vocabulary

--

.pull-left[

Functions

]

--

.pull-right[

Vocabulary

]


Chapter Overview

.left-column[

Approaches to longitudinal analysis

Analysis or study of data with more than one time point with repeated measures over time

]

--

.right-column[ Repeated cross-sections

Panel datasets

Time series analysis

Cohort studies

Event history datasets

What type of analysis do you usually run?

When do you look at data over time in your work?

.palegrey[.right[.footnote[Source]]] ]


class: title-slide, left, bottom background-image: url(img/section.jpg) background-size: cover

Let's work with some data

.small[ Photo by Joanna Kosinska]


background-color: #006876 class: center, middle, inverse

r icons::fontawesome$solid$table


Import Data


Loading Packages & Handling Conflicts

.left-column[

library(tidyverse)
library(dataedu)
library(lubridate)
library(here)

]

.right-column[

Special Note on here()

here() could be from multiple packages {plyr}, {here}

Use :: to declare the NAMESPACE and specify which package you'd like to use: here::here()

Have you ever run into package conflicts? How did you resolve them?

]


Importing Data

How else have you imported data?

.panelset[ .panel[.panel-name[{dataedu} Package]

dataedu::bchildcountandedenvironments2012 # individual datasets
dataedu::all_files # combined files in list format
dataedu::child_counts # combined dataset in data frame format (but this skips processing)

]

.panel[.panel-name[Individual Source URL]

# must be done for each file

download.file(
  # the url argument takes a URL for a CSV file
  url = 'https://bit.ly/3dCtVtf', 
  # destfile specifies where the file should be saved
  destfile = here::here("data",
           "longitudinal_data",
           "bchildcountandedenvironments2012.csv"), 
  mode = "wb")

]

.panel[.panel-name[Individual CSV Files]

# must be done for each file
# data must be in corresponding folder

read_csv(here::here(
  "data",
  "longitudinal_data",
  "bchildcountandedenvironments2012.csv"
),
skip = 4)

]

.panel[.panel-name[Multiple CSV Files]

# data must be in corresponding folder
filenames <-
  list.files(path = here::here("data", "longitudinal_data"),
             full.names = TRUE)

all_files <-
  filenames %>%
  # Apply the function read_csv to each element of filenames
  map(., ~ read_csv(., skip = 4))

]

.panel[.panel-name[Other Options]

library(glue)
library(here)
library(tidyverse)

repo <- "https://github.com/data-edu/data-science-in-education/"
data <- "blob/master/data/longitudinal_data/"
filenames <- glue("bchildcountandedenvironments{rep(2012:2017)}")
filenames[6] <- glue("{filenames[6]}-18")
filenames <- glue("{filenames}.csv")
urls <- glue("{repo}{data}{filenames}")
local_filenames <- glue("{here('Desktop', 'data', 'longitudinal_data')}/{filenames}")

walk2(urls, local_filenames, ~ download.file(url = .x, destfile = .y, mode = "w"))

] ]


background-color: #006876 class: inverse

.left-column[

Process Data
r icons::fontawesome$regular$edit

]

.right-column[ 1. Fix the variable names in the 2016 data 2. Combine the datasets 3. Pick variables 4. Filter for the desired categories 5. Rename the variables 6. Standardize the state names 7. Transform the column formats from wide to long using pivot_longer 8. Change the data types of variables 9. Explore NAs

.small[ Illustration by Allison Horst] ]


Process Data Part 1

.panelset[ .panel[.panel-name[Fix the Variable Names]

When we print the 2016 dataset, we notice that the variable names are incorrect. Let’s verify that by looking at the first ten variable names of the 2016 dataset, which is the fifth element of all_files:

names(all_files[[5]])[1:10]

To fix it, we’ll read the 2016 dataset again using read_csv() and the fifth element of filenames but this time will use the argument skip = 3. We’ll assign the newly read dataset to the fifth element of the all_files list:

all_files[[5]] <-
  # Skip the first 3 lines instead of the first 4
  read_csv(filenames[[5]], skip = 3)

]

.panel[.panel-name[Pick Variables 1]

all_files[[1]] %>%
  select(
    Year,
    contains("State", ignore.case = FALSE),
    contains("SEA", ignore.case = FALSE),
    contains("male")
  )

]

.panel[.panel-name[Pick Variables 2]

# build the function
pick_vars <-
  function(df) {
    df %>%
      select_at(vars(
        Year,
        contains("State", ignore.case = FALSE),
        contains("SEA", ignore.case = FALSE),
        contains("male")
      ))
  }

# use the function with `all_files`
all_files <-
  all_files %>%
  map(pick_vars)

]

.panel[.panel-name[Combine Datasets]

child_counts <-
  all_files %>%
  # combine all datasets in `all_files`
  bind_rows()

] ]


Filter Data

.panelset[.panel[.panel-name[Look at Variables]

child_counts %>%
  # count number of times the category appears in the dataset
  count(`SEA Disability Category`) %>% 
  head(5)

]

.panel[.panel-name[Choose Category]

Since we will be visualizing and modeling gender variables for all students in the dataset, we’ll filter out all subgroups except “All Disabilities” and the age totals:

child_counts <-
  child_counts %>%
  filter(
    # filter all but the All Disabilities category
    `SEA Disability Category` == "All Disabilities",
    # filter all but the age totals
    `SEA Education Environment` %in% c("Total, Age 3-5", "Total, Age 6-21")
  ) 

] ]


Clean Data

.panelset[ .panel[.panel-name[Rename Variables]

child_counts <-
  child_counts %>%
  rename(
    # change these columns to more convenient names
    year = Year,
    state = "State Name",
    age = "SEA Education Environment",
    disability = "SEA Disability Category",
    f_3_5 = "Female Age 3 to 5",
    m_3_5 = "Male Age 3 to 5",
    f_6_21 = "Female Age 6 to 21",
    m_6_21 = "Male Age 6 to 21"
  )

]

.panel[.panel-name[Edit Data]

child_counts %>%
  count(state) %>%
  head(4)
child_counts <-
  child_counts %>%
  mutate(state = tolower(state))

]

]


Tidy Data

.left-column[ Tidy data is a specific way of organizing data into a consistent format which plugs into the tidyverse set of packages for R.

Resources

.right-column[

.small[ Illustration by Allison Horst] ] ]


Tidy Data

child_counts <-
  child_counts %>%
    pivot_longer(cols = f_3_5:m_6_21, 
                 names_to = "gender", 
                 values_to = "total")
child_counts %>% 
  head(10)

Finish Processing Data

.panelset[ .panel[.panel-name[Create gender Variable]

child_counts <- 
  child_counts %>%
  mutate(
    gender = case_when(
      gender == "f_3_5" ~ "f",
      gender == "m_3_5" ~ "m",
      gender == "f_6_21" ~ "f",
      gender == "m_6_21" ~ "m",
      TRUE ~ as.character(gender)))

child_counts %>% head(5)

]

.panel[.panel-name[Convert Types]

child_counts <-
  child_counts %>%
  mutate(total = as.numeric(total),
         year = ymd(year, truncated = 2))

child_counts %>% head(5)

]

.panel[.panel-name[Remove NA's]

child_counts <-
  child_counts %>%
  filter(!is.na(total)) 

What do you consider when removing NA's?

How else have you removed NA's in the past?

]

.panel[.panel-name[Further Filtering]

child_counts %>%
  group_by(state) %>%
  summarize(mean_count = mean(total, na.rm = TRUE)) %>%
  # which six states have the highest mean count of students with disabilities
  top_n(6, mean_count)
# filter to top states and create a new object
high_count <-
  child_counts %>%
  filter(state %in% c("california", "florida", "new york", "pennsylvania", "texas"))

] ]


class: title-slide, left, top background-image: url(img/process.png) background-size: cover


background-color: #006876 class: center, middle, inverse

r icons::fontawesome$solid$brain

Analyze Data


Visualize the Data


Count of Female Students in Special Education Over Time

.pull-left[

Code


high_count %>%
  filter(gender == "f", age == "Total, Age 6-21") %>%
  ggplot(aes(x = year, y = total, color = state)) +
  geom_freqpoly(stat = "identity", size = 1) +
  labs(title = "Count of Female Students in Special Education Over Time",
       subtitle = "Ages 6-21") +
  scale_color_dataedu() +
  theme_dataedu()

]

.pull-right[

Plot



]


Count of Male Students in Special Education Over Time

.pull-left[

Code


high_count %>%
  filter(gender == "m", age == "Total, Age 6-21") %>%
  ggplot(aes(x = year, y = total, color = state)) +
  geom_freqpoly(stat = "identity", size = 1) +
  labs(title = "Count of Male Students in Special Education Over Time",
       subtitle = "Ages 6-21") +
  scale_color_dataedu() +
  theme_dataedu()

]

.pull-right[

Plot



]


Total Count of Students in Special Education Over Time

.pull-left[

Code


Add age and gender values together using group_by() and summarize().

high_count %>%
  group_by(year, state) %>%
  summarize(n = sum(total)) %>%
  ggplot(aes(x = year, y = n, color = state)) +
  geom_freqpoly(stat = "identity", size = 1) +
  labs(title = "Total Count of Students in Special Education Over Time",
       subtitle = "Ages 3-21") +
  scale_color_dataedu() +
  theme_dataedu()

]

.pull-right[

Plot



]


Median Students with Disabilities Count

.pull-left[

Code


high_count %>%
  group_by(year, state) %>%
  summarize(n = sum(total)) %>%
  ggplot(aes(x = state, y = n)) +
  geom_boxplot(fill = dataedu_colors("yellow")) +
  labs(title = "Median Students with Disabilities Count",
       subtitle = "All ages and genders, 2012-2017") +
  theme_dataedu() 

]

.pull-right[

Plot



]


Male Student to Female Student Ratio Over Time

.pull-left[

Code


Compare the male student count to the female student count using a ratio.

high_count %>%
  group_by(year, state, gender) %>%
  summarize(total = sum(total)) %>%
  # Create new columns for male and female student counts
  pivot_wider(names_from = gender, 
              values_from = total) %>% 
  # Create a new ratio column
  mutate(ratio = m / f) %>%
  ggplot(aes(x = year, y = ratio, color = state)) +
  geom_freqpoly(stat = "identity", size = 1) +
  scale_y_continuous(limits = c(1.5, 2.5)) +
  labs(title = "Male Student to Female Student Ratio Over Time",
       subtitle = "Ages 6-21") +
  scale_color_dataedu() +
  theme_dataedu()

]

.pull-right[

Plot



]


background-color: #006876 class: center, middle, inverse

r icons::fontawesome$solid$brain

Analyze Data


Model the Data

--

We'll explore the relationship between male and female student counts in our special education dataset. In particular, we'll test the hypothesis that this ratio has decreased over the years. Fitting a linear regression model that estimates the year as a predictor of the male to female ratio will help us do just that.


Check the data we'll use for the model

.panelset[ .panel[.panel-name[Visualize Data] .pull-left[

Code


child_counts %>%
  filter(age == "Total, Age 6-21") %>%
  pivot_wider(names_from = gender, 
              values_from = total) %>%
  ggplot(aes(x = f, y = m)) +
  geom_point(size = 3, alpha = .5, color = dataedu_colors("green")) +
  geom_smooth() +
  labs(
    title = "Comparison of Female Students to Male Students in Special Education",
    subtitle = "Counts of students in each state, ages 6-21",
    x = "Female students",
    y = "Male students",
    caption = "Data: US Dept of Education"
  ) +
  theme_dataedu()

]

.pull-right[

Plot



] ]

.panel[.panel-name[Check Data]

child_counts %>%
  filter(age == "Total, Age 6-21") %>%
  pivot_wider(names_from = gender, 
              values_from = total) %>% 
  filter(f > 500000) %>%
  select(year, state, age, f, m) %>% 
  head(5)

]

.panel[.panel-name[Filter Data] .pull-left[

Code


child_counts %>%
  filter(age == "Total, Age 6-21") %>%
  pivot_wider(names_from = gender, 
              values_from = total) %>%
  # Filter for female student counts less than 500,000
  filter(f <= 500000) %>%
  ggplot(aes(x = f, y = m)) +
  geom_point(size = 3, alpha = .5, color = dataedu_colors("green")) +
  labs(
    title = "Comparison of Female Students to Male Students with Disabilities",
    subtitle = "Counts of students in each state, ages 6-21.\nDoes not include outlying areas and freely associated states",
    x = "Female students",
    y = "Male students",
    caption = "Data: US Dept of Education") +
  theme_dataedu()

]

.pull-right[

Plot



] ]

.panel[.panel-name[Create Model Data]

model_data <- child_counts %>%
  filter(age == "Total, Age 6-21") %>%
  mutate(year = as.factor(year(year))) %>%
  pivot_wider(names_from = gender, 
              values_from = total) %>% 
  # Exclude outliers
  filter(f <= 500000) %>%
  # Compute male student to female student ratio
  mutate(ratio = m / f) %>%
  select(-c(age, disability))

model_data %>% count(year)

]

.panel[.panel-name[Visualize Model Data] .pull-left[

Code


ggplot(data = model_data, aes(x = year, y = ratio)) +
  geom_jitter(alpha = .5, color = dataedu_colors("green")) +
  labs(title = "Male to Female Ratio Across Years (Jittered)") +
  theme_dataedu()

]

.pull-right[

Plot



] ] ]


Model Data

ratio_year <- 
  lm(ratio ~ year, data = model_data)

summary(ratio_year)

Model Data

.pull-left[

Code


model_data %>%
  pivot_longer(cols = c(f, m), 
               names_to = "gender", 
               values_to = "students") %>% 
  ggplot(aes(x = year, y = students, color = gender)) +
  geom_boxplot() +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Median Male and Female Student Counts in Special Education",
    subtitle = "Ages 6-21. Does not include outlying areas and freely associated states",
    x = "",
    y = "",
    caption = "Data: US Dept of Education") +
  scale_color_dataedu() +
  theme_dataedu()

]

.pull-right[

Plot



]


background-color: #006876 class: center, middle, inverse

r icons::fontawesome$solid$check


Report Results


.left-column[

Review Results & Next Steps

]

.right-column[

What'd we learn?

We learned from that male to female ratios did not change in any meaningful way from 2012 to 2017 and that the median ratio across states was about two male students to every female student.

  1. Each state has a different count of students with disabilities–so different that we need to use statistics like ratios or visualizations to compare across states.

  2. The five states we looked at have increased their student with disabilities counts.

  3. Our model suggests that these decreases do not represent a big difference.

Conclusion

  1. Further analyses: While we explored student counts across each state and verified that there is variation in the counts, a good next step would be to combine these data with total enrollment data. What other analyses should we run to add to this linear regression?

  2. Contextual data: What is happening in these states around students with disabilities (policy? systems?) that we could use to build upon our results?

  3. What other questions are raised from this analysis and what steps should we take in the future?

]


background-color: #006876 class: center, middle, inverse

Questions?


An Introduction to xaringan for Presentations: The Basics and Beyond by Dr. Silvia Canelón



r4ds/bookclub-dsieur documentation built on May 20, 2022, 6:24 p.m.